package component.fulltextsearch.index.handlingtypes;

import java.io.IOException;
import java.io.InputStream;
import java.util.logging.Level;
import java.util.logging.Logger;

import org.pdfbox.pdmodel.PDDocument;
import org.pdfbox.util.PDFTextStripper;


/**
 * {@link IInputStreamHandler} implementation that extracts the text of a PDF
 * document using PDFBox ({@link PDDocument} and {@link PDFTextStripper}).
 */
public class PDFBoxPDFHandler implements IInputStreamHandler {
	/** Reports non-fatal problems, such as a failure to close a parsed document. */
	private static final Logger LOGGER = Logger.getLogger(PDFBoxPDFHandler.class.getName());

	/**
	 * Loads a PDF document from the given stream and returns its text.
	 *
	 * <p>The loaded {@link PDDocument} is always closed before this method
	 * returns. This method does not itself call {@code close()} on {@code is}.
	 *
	 * @param is stream positioned at the start of the PDF data
	 * @return the text produced by {@link PDFTextStripper#getText(PDDocument)}
	 * @throws InputStreamHandlerException if loading the document or extracting
	 *         its text fails with an {@link IOException}; the original
	 *         exception is attached as the cause
	 */
	public String getText(InputStream is) throws InputStreamHandlerException {
		PDDocument document = null;
		try {
			document = PDDocument.load(is);

			// Extract the text with PDFTextStripper.
			PDFTextStripper stripper = new PDFTextStripper();
			stripper.setSortByPosition(false); // leave sort-by-position disabled
			stripper.setStartPage(1); // begin at the first page
			stripper.setEndPage(Integer.MAX_VALUE); // largest possible end page: no page limit imposed here

			return stripper.getText(document);
		} catch (IOException e) {
			throw new InputStreamHandlerException("不能解析PDF文件", e);
		} finally {
			closeQuietly(document);
		}
	}

	/**
	 * Closes the document if one was loaded. A close failure is logged rather
	 * than thrown, so it can neither mask an exception raised during extraction
	 * nor discard text that was already extracted successfully.
	 */
	private static void closeQuietly(PDDocument document) {
		if (document == null) {
			return;
		}
		try {
			document.close();
		} catch (IOException e) {
			LOGGER.log(Level.WARNING, "Failed to close PDF document after text extraction", e);
		}
	}
}
